{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "聚类\n",
    "熟悉各中聚类算法的调用 并用评价指标选择合适的超参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取训练数据\n",
    "train = pd.read_csv('~/Desktop/traindata.csv')\n",
    "\n",
    "X_train = train.drop(['event_id','user_id','start_time','city','state','zip','country','lat','lng'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#将像素值[0,255]  --> [0,1]\n",
    "X_train = X_train / 255.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the shape of train_image: (13418, 102)\n"
     ]
    }
   ],
   "source": [
    "# 原始输入的特征维数和样本数目\n",
    "print('the shape of train_image: {}'.format(X_train.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 102)\n"
     ]
    }
   ],
   "source": [
    "#拆分后的训练集和校验集的样本数目\n",
    "print(X_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能\n",
    "def K_cluster_analysis(K, X_train):\n",
    "    start = time.time()\n",
    "    \n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    \n",
    "    \n",
    "    y_data= mb_kmeans.predict(X_train)\n",
    "    \n",
    "    #以前两维特征打印训练数据的分类结果\n",
    "    #plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred)\n",
    "    #plt.show()\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(X_train,y_data)\n",
    "    \n",
    "    \n",
    "    end = time.time()\n",
    "    print(\"K-means begin with clusters:{}\".format(K))\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "   # print(\"CH_score: {}\".format(CH_score))\n",
    "    \n",
    "    return CH_score,y_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters:10\n",
      "CH_score: 0.5226609216294443, time elaps:9\n",
      "K-means begin with clusters:20\n",
      "CH_score: 0.4959557926144086, time elaps:8\n",
      "K-means begin with clusters:30\n",
      "CH_score: 0.470058109087869, time elaps:6\n",
      "K-means begin with clusters:40\n",
      "CH_score: 0.4528539673389327, time elaps:6\n",
      "K-means begin with clusters:50\n",
      "CH_score: 0.4315355334584844, time elaps:6\n",
      "K-means begin with clusters:60\n",
      "CH_score: 0.4176404477379842, time elaps:8\n",
      "K-means begin with clusters:70\n",
      "CH_score: 0.39387831906344895, time elaps:9\n",
      "K-means begin with clusters:80\n",
      "CH_score: 0.38725241102226277, time elaps:8\n",
      "K-means begin with clusters:90\n",
      "CH_score: 0.3727929849082477, time elaps:8\n",
      "K-means begin with clusters:100\n",
      "CH_score: 0.36628446521920976, time elaps:10\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [10,20,30,40,50,60,70,80,90,100]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "setting an array element with a sequence.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-54-eff1b68f9208>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCH_scores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'b-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m: setting an array element with a sequence."
     ]
    }
   ],
   "source": [
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Expected 2D array, got 1D array instead:\narray=[4. 4. 4. ... 2. 2. 2.].\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-56-aeedf8dd14d3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mn_clusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mmb_kmeans\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMiniBatchKMeans\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_clusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_clusters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mmb_kmeans\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mcents\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmb_kmeans\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcluster_centers_\u001b[0m\u001b[0;31m#质心\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/cluster/k_means_.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m   1352\u001b[0m         \u001b[0mrandom_state\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_random_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1353\u001b[0m         X = check_array(X, accept_sparse=\"csr\", order='C',\n\u001b[0;32m-> 1354\u001b[0;31m                         dtype=[np.float64, np.float32])\n\u001b[0m\u001b[1;32m   1355\u001b[0m         \u001b[0mn_samples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1356\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mn_samples\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_clusters\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m    439\u001b[0m                     \u001b[0;34m\"Reshape your data either using array.reshape(-1, 1) if \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    440\u001b[0m                     \u001b[0;34m\"your data has a single feature or array.reshape(1, -1) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m                     \"if it contains a single sample.\".format(array))\n\u001b[0m\u001b[1;32m    442\u001b[0m             \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0matleast_2d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    443\u001b[0m             \u001b[0;31m# To ensure that array flags are maintained\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Expected 2D array, got 1D array instead:\narray=[4. 4. 4. ... 2. 2. 2.].\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample."
     ]
    }
   ],
   "source": [
    "#显示聚类结果\n",
    "#画出聚类结果，每一类用一种颜色\n",
    "colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']\n",
    "\n",
    "n_clusters = 10\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = n_clusters)\n",
    "mb_kmeans.fit(X_train)\n",
    "\n",
    "cents = mb_kmeans.cluster_centers_#质心\n",
    "\n",
    "for i in range(n_clusters):\n",
    "    index = np.nonzero(y_data==i)[0]\n",
    "    x = X_train[index,0]\n",
    "    y_i = y_data[index]\n",
    "    for j in range(len(x1)):\n",
    "        if j < 20:  #每类打印20个\n",
    "            plt.text(x[j],str(int(y_i[j])),color=colors[i],\\\n",
    "                fontdict={'weight': 'bold', 'size': 9})\n",
    "    #plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],linewidths=12)\n",
    "\n",
    "plt.axis([-5,10,-6,6])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
